#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
const char* depthwise_conv2d_subgroup_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void depthwise_conv_2d_buf_c16_c16(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int inputHeight,\n"
" __private const int inputWidth,\n"
" __private const int Channel,\n"
" __private const int Batch,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int outputHeight,\n"
" __private const int outputWidth,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right,\n"
" __private const int pad_w,\n"
" __private const int pad_h\n"
") {\n"
" const int x_blocks=(outputWidth+7)/8;\n"
" const int sglid=get_sub_group_local_id();\n"
" const int b=get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks)*8;\n"
" const int y=(xy/x_blocks);\n"
" const int c=get_group_id(1);\n"
" const int input_x=x*STRIDE_WIDTH-pad_w;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_h;\n"
" const int channel_pack=((Channel+15)/16);\n"
" const uint input_x_pitch=16;\n"
" const uint input_y_pitch=input_x_pitch*(inputWidth+input_pad_left+input_pad_right);\n"
" const uint input_fs_pitch=input_y_pitch*(inputHeight);\n"
" const uint input_b_pitch=input_fs_pitch*channel_pack;\n"
" const uint input_offset=b*input_b_pitch +\n"
" c*input_fs_pitch+\n"
" input_y*input_y_pitch +\n"
" (input_x+input_pad_left)*input_x_pitch;\n"
" const uint output_x_pitch=16;\n"
" const uint output_y_pitch=output_x_pitch*(outputWidth+output_pad_left+output_pad_right);\n"
" const uint output_fs_pitch=output_y_pitch*outputHeight;\n"
" const uint output_b_pitch=output_fs_pitch*channel_pack;\n"
" const uint output_offset=b*output_b_pitch +\n"
" c*output_fs_pitch +\n"
" y*output_y_pitch +\n"
" (x+output_pad_left)*output_x_pitch;\n"
" const uint filter_x_pitch=16;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_offset=c*filter_is_pitch;\n"
"#ifdef MNN_SUPPORT_FP16\n"
" COMPUTE_FLOAT8 dst=(COMPUTE_FLOAT8)(as_half(intel_sub_group_block_read_us((__global ushort*)(biases+c*16))));\n"
" for(int i=0; i<FILTER_HEIGHT; ++i){\n"
" if ((input_y+i*DILATION_HEIGHT)<0 || (input_y+i*DILATION_HEIGHT) >= inputHeight)\n"
" continue;\n"
" for(int j=0; j<FILTER_WIDTH; ++j){\n"
" COMPUTE_FLOAT wei=as_half(intel_sub_group_block_read_us((__global ushort*)(weights+filter_offset+i*filter_y_pitch+j*filter_x_pitch)));\n"
" for(int k=0; k<8; ++k){\n"
" COMPUTE_FLOAT src=as_half(intel_sub_group_block_read_us((__global ushort*)(input+input_offset+i*DILATION_HEIGHT*input_y_pitch+(j*DILATION_WIDTH+k*STRIDE_WIDTH)*input_x_pitch)));\n"
" dst[k]=mad(src,wei,dst[k]);\n"
" }\n"
" }\n"
" }\n"
" \n"
"#else\n"
" COMPUTE_FLOAT8 dst=(COMPUTE_FLOAT8)(as_float(intel_sub_group_block_read((__global uint*)(biases+c*16))));\n"
" for(int i=0; i<FILTER_HEIGHT; ++i){\n"
" if ((input_y+i*DILATION_HEIGHT)<0 || (input_y+i*DILATION_HEIGHT) >= inputHeight)\n"
" continue;\n"
" for(int j=0; j<FILTER_WIDTH; ++j){\n"
" COMPUTE_FLOAT wei=as_float(intel_sub_group_block_read((__global ushort*)(weights+filter_offset+i*filter_y_pitch+j*filter_x_pitch)));\n"
" for(int k=0; k<8; ++k){\n"
" COMPUTE_FLOAT src=as_float(intel_sub_group_block_read((__global ushort*)(input+input_offset+i*DILATION_HEIGHT*input_y_pitch+(j*DILATION_WIDTH+k*STRIDE_WIDTH)*input_x_pitch)));\n"
" dst[k]=mad(src,wei,dst[k]);\n"
" }\n"
" }\n"
" }\n"
"#endif\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
" \n"
" for (int i=0; i<8 && (x+i)<outputWidth; i++) {\n"
"#ifdef MNN_SUPPORT_FP16\n"
" intel_sub_group_block_write_us((__global ushort*)(output+output_offset+i*output_x_pitch),as_ushort((FLOAT)dst[i]));\n"
"#else\n"
" intel_sub_group_block_write((__global uint*)(output+output_offset+i*output_x_pitch),as_uint((FLOAT)dst[i]));\n"
"#endif\n"
" }\n"
" if(x == 0){\n"
" uint pad_offset=b*output_b_pitch+c*output_fs_pitch+y*output_y_pitch;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*output_x_pitch+sglid]=0;\n"
" }\n"
" pad_offset += (outputWidth+output_pad_left)*output_x_pitch;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*output_x_pitch+sglid]=0;\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void depthwise_conv_2d_buf_c16_c4(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int inputHeight,\n"
" __private const int inputWidth,\n"
" __private const int Channel,\n"
" __private const int Batch,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int outputHeight,\n"
" __private const int outputWidth,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right,\n"
" __private const int pad_w,\n"
" __private const int pad_h\n"
") {\n"
" const int x_blocks=(outputWidth+7)/8;\n"
" const int sglid=get_sub_group_local_id();\n"
" const int b=get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks)*8;\n"
" const int y=(xy/x_blocks);\n"
" const int c=get_group_id(1);\n"
" const int input_x=x*STRIDE_WIDTH-pad_w;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_h;\n"
" const int channel_pack=((Channel+15)/16);\n"
" const uint input_x_pitch=16;\n"
" const uint input_y_pitch=input_x_pitch*(inputWidth+input_pad_left+input_pad_right);\n"
" const uint input_fs_pitch=input_y_pitch*(inputHeight);\n"
" const uint input_b_pitch=input_fs_pitch*channel_pack;\n"
" const uint input_offset=b*input_b_pitch +\n"
" c*input_fs_pitch+\n"
" input_y*input_y_pitch +\n"
" (input_x+input_pad_left)*input_x_pitch;\n"
" const uint output_x_pitch=4;\n"
" const uint output_y_pitch=output_x_pitch*outputWidth;\n"
" const uint output_fs_pitch=output_y_pitch*outputHeight;\n"
" const uint output_b_pitch=output_fs_pitch*Batch;\n"
" const uint output_offset=(c << 2)*output_b_pitch +\n"
" b*output_fs_pitch +\n"
" y*output_y_pitch +\n"
" x*output_x_pitch;\n"
" const uint filter_x_pitch=16;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_offset=c*filter_is_pitch;\n"
"#ifdef MNN_SUPPORT_FP16\n"
" COMPUTE_FLOAT8 dst=(COMPUTE_FLOAT8)(as_half(intel_sub_group_block_read_us((__global ushort*)(biases+c*16))));\n"
" for(int i=0; i<FILTER_HEIGHT; ++i){\n"
" if ((input_y+i*DILATION_HEIGHT)<0 || (input_y+i*DILATION_HEIGHT) >= inputHeight)\n"
" continue;\n"
" for(int j=0; j<FILTER_WIDTH; ++j){\n"
" COMPUTE_FLOAT wei=as_half(intel_sub_group_block_read_us((__global ushort*)(weights+filter_offset+i*filter_y_pitch+j*filter_x_pitch)));\n"
" for(int k=0; k<8; ++k){\n"
" COMPUTE_FLOAT src=as_half(intel_sub_group_block_read_us((__global ushort*)(input+input_offset+i*DILATION_HEIGHT*input_y_pitch+(j*DILATION_WIDTH+k*STRIDE_WIDTH)*input_x_pitch)));\n"
" dst[k]=mad(src,wei,dst[k]);\n"
" }\n"
" }\n"
" }\n"
" \n"
"#else\n"
" COMPUTE_FLOAT8 dst=(COMPUTE_FLOAT8)(as_float(intel_sub_group_block_read((__global uint*)(biases+c*16))));\n"
" for(int i=0; i<FILTER_HEIGHT; ++i){\n"
" if ((input_y+i*DILATION_HEIGHT)<0 || (input_y+i*DILATION_HEIGHT) >= inputHeight)\n"
" continue;\n"
" for(int j=0; j<FILTER_WIDTH; ++j){\n"
" COMPUTE_FLOAT wei=as_float(intel_sub_group_block_read((__global ushort*)(weights+filter_offset+i*filter_y_pitch+j*filter_x_pitch)));\n"
" for(int k=0; k<8; ++k){\n"
" COMPUTE_FLOAT src=as_float(intel_sub_group_block_read((__global ushort*)(input+input_offset+i*DILATION_HEIGHT*input_y_pitch+(j*DILATION_WIDTH+k*STRIDE_WIDTH)*input_x_pitch)));\n"
" dst[k]=mad(src,wei,dst[k]);\n"
" }\n"
" }\n"
" }\n"
"#endif\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
" const uint lid_x=sglid % 4;\n"
" const uint lid_y=sglid/4;\n"
" for (int i=0; i<8 && (x+i)<outputWidth; i++) {\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=dst[i];\n"
" }\n"
"}\n"
;
#endif
#endif
}
